Taking the radix R package for a test spin with Scikit-Learn!
Topics covered: both Python and R via the reticulate package; sleek web-based reporting with the new radix package; machine learning algorithms and pipelines with the sklearn library; excellent visualization capabilities with ggplot2 and plotly. Take advantage of the new radix R package for sleek scientific reporting.
# Load reticulate, the R <-> Python bridge
library(reticulate)

# List every conda environment reticulate can see on this machine
print(conda_list())
name
1 r-miniconda
2 r-gluonts
3 r-mlflow-1.16.0
4 r-mlflow-1.18.0
5 r-reticulate
6 r-tf
7 anaconda3
8 r-reticulate
python
1 /Users/gouthaman/Library/r-miniconda/bin/python
2 /Users/gouthaman/Library/r-miniconda/envs/r-gluonts/bin/python
3 /Users/gouthaman/Library/r-miniconda/envs/r-mlflow-1.16.0/bin/python
4 /Users/gouthaman/Library/r-miniconda/envs/r-mlflow-1.18.0/bin/python
5 /Users/gouthaman/Library/r-miniconda/envs/r-reticulate/bin/python
6 /Users/gouthaman/Library/r-miniconda/envs/r-tf/bin/python
7 /Users/gouthaman/opt/anaconda3/bin/python
8 /Volumes/Data Science/Data Science Projects/bs-R-tips/renv/python/r-reticulate/bin/python
# Point reticulate's Python session at the "anaconda3" conda environment
# (must be called before the first Python chunk binds an interpreter)
use_condaenv("anaconda3")
Tutorial reference: https://elitedatascience.com/python-machine-learning-tutorial-scikit-learn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.datasets import load_wine
# Fetch scikit-learn's built-in wine dataset (a Bunch of arrays)
wine = load_wine()

# Stitch the feature matrix and target vector together into a single
# DataFrame, with the target in its own "target" column
feature_matrix = np.c_[wine["data"], wine["target"]]
column_names = wine["feature_names"] + ["target"]
data = pd.DataFrame(data=feature_matrix, columns=column_names)
library(tidyverse)

# Bring the Python DataFrame into R (via reticulate's `py` object) and
# inspect its structure.
# as_tibble() replaces as.tibble(), which is deprecated in tibble >= 2.0.
py$data %>%
  as_tibble() %>%
  glimpse()
Rows: 178
Columns: 14
$ alcohol <dbl> 14.23, 13.20, 13.16, 14.37, 1…
$ malic_acid <dbl> 1.71, 1.78, 2.36, 1.95, 2.59,…
$ ash <dbl> 2.43, 2.14, 2.67, 2.50, 2.87,…
$ alcalinity_of_ash <dbl> 15.6, 11.2, 18.6, 16.8, 21.0,…
$ magnesium <dbl> 127, 100, 101, 113, 118, 112,…
$ total_phenols <dbl> 2.80, 2.65, 2.80, 3.85, 2.80,…
$ flavanoids <dbl> 3.06, 2.76, 3.24, 3.49, 2.69,…
$ nonflavanoid_phenols <dbl> 0.28, 0.26, 0.30, 0.24, 0.39,…
$ proanthocyanins <dbl> 2.29, 1.28, 2.81, 2.18, 1.82,…
$ color_intensity <dbl> 5.64, 4.38, 5.68, 7.80, 4.32,…
$ hue <dbl> 1.04, 1.05, 1.03, 0.86, 1.04,…
$ `od280/od315_of_diluted_wines` <dbl> 3.92, 3.40, 3.17, 3.45, 2.93,…
$ proline <dbl> 1065, 1050, 1185, 1480, 735, …
$ target <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
Set up the data as X (features) and y (target) variables
# Split the DataFrame into the response vector y and the predictor matrix X
y = data["target"]
X = data.drop(columns="target")
Split features into training and testing sets
# Reserve 20% of the rows as a held-out test set.  Stratifying on y keeps
# the class proportions identical across the two splits, and the fixed
# seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1987, stratify=y)
Preprocess by scaling X_train
# Learn scaling parameters (mean, std) from the training features only,
# so no information leaks from the test set
scaler = preprocessing.StandardScaler().fit(X_train)
Apply transformation to X_test
# Standardize the test features using the training-set statistics
X_test_scaled = scaler.transform(X_test)
Setup ML Pipeline
# Bundle standardization and a 100-tree random forest into one estimator,
# so both steps are re-fit together inside each cross-validation fold
scaling_step = preprocessing.StandardScaler()
forest = RandomForestRegressor(n_estimators=100)
pipeline = make_pipeline(scaling_step, forest)
Setup Grid Search
# Candidate settings for the grid search.  Keys use sklearn's
# "<step name>__<parameter>" convention to address parameters of a
# pipeline step from the outside.
# NOTE: max_features="auto" was deprecated in scikit-learn 1.1 and
# removed in 1.3; for a regressor its equivalent is 1.0 (all features).
hyperparameters = {
    "randomforestregressor__max_features": [1.0, "sqrt", "log2"],
    "randomforestregressor__max_depth": [None, 5, 3, 1],
}
Apply grid search with CV.
# Try every combination in the grid with 10-fold cross-validation; after
# the search, clf is refit on all of X_train using the best settings
clf = GridSearchCV(pipeline, hyperparameters, cv = 10)
clf.fit(X_train, y_train)
GridSearchCV(cv=10,
estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
('randomforestregressor',
RandomForestRegressor())]),
param_grid={'randomforestregressor__max_depth': [None, 5, 3, 1],
'randomforestregressor__max_features': ['auto', 'sqrt',
'log2']})
Get best parameters
# Show the hyperparameter combination that won the grid search
print(clf.best_params_)
{'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'sqrt'}
# Score the tuned model on the held-out test set (R^2, higher is better)
y_pred = clf.predict(X_test)
print(r2_score(y_test, y_pred))
0.9618984771573604
# Mean squared error on the same held-out predictions (lower is better)
print(mean_squared_error(y_test, y_pred))
0.023166666666666665
Plot each prediction-and-actual pair from the test set, arranged by quality level, to visualize how our model is performing
library(tidyverse)
library(tidyquant)
library(plotly)

# Pair each prediction with its actual value, order the rows by the true
# quality level, then reshape to long format so predictions and actuals
# can share one plot.
results_tbl <- tibble(
    y_pred = py$y_pred,
    y_test = py$y_test
  ) %>%
  rowid_to_column() %>%                           # original row id
  arrange(y_test) %>%                             # sort by actual quality
  mutate(rowid = as_factor(as.character(rowid))) %>%
  rowid_to_column("sorted_rowid") %>%             # position after sorting
  pivot_longer(
    cols      = -contains("rowid"),
    names_to  = "key",
    values_to = "value"
  )
# Scatter each prediction/actual pair.  The x aesthetic is the factor
# `rowid`, whose level order was fixed after arranging by y_test, so its
# positions line up with the numeric `sorted_rowid` that the smoother
# uses.  Wrapping the ggplot in ggplotly() makes it interactive.
(results_tbl %>%
ggplot(aes(rowid, value, color = key)) +
geom_point(size = 0.5) +
geom_smooth(aes(sorted_rowid, value)) +
theme_tq() +
scale_color_tq() +
labs(title = "Wine Quality Level Predictions",
x = "Row ID", y = "Quality Level")) %>%
ggplotly()